# import all libraries needed
import osmnx as ox, geopandas as gpd, pandas as pd, networkx as nx, pandas as pd
from IPython.display import Image
%matplotlib inline
ox.config(log_console=True, use_cache=True)
import scipy as sp
# Import distinctiveness package
from distinctiveness.dc import distinctiveness
# Load the registered-business CSV downloaded from San Francisco Open Data
df_buss = pd.read_csv('registered_business_sanfra.csv', low_memory=False)
df_buss.head()
# Keep only the columns relevant to this analysis
wanted_columns = ['Location Id', 'DBA Name', 'Street Address', 'City',
                  'NAICS Code Description', 'Business Location']
df_buss_2 = df_buss[wanted_columns]
df_buss_2.head()
# Drop rows that are missing a business category or a business location
df_buss_3 = df_buss_2[df_buss_2['NAICS Code Description'].notna()]
df_buss_4 = df_buss_3[df_buss_3['Business Location'].notna()]
df_buss_4
# Count how many businesses fall in each NAICS category
df_buss_4['NAICS Code Description'].value_counts()
# Assign a numeric code (1-18) to each NAICS business-category description.
# A dict lookup replaces the original 18-branch if/elif chain; any category
# not listed here is marked "fail", exactly as before.
NAICS_CATEGORY_CODES = {
    'Real Estate and Rental and Leasing Services': 1,
    'Professional, Scientific, and Technical Services': 2,
    'Construction': 3,
    'Retail Trade': 4,
    'Food Services': 5,
    'Private Education and Health Services': 6,
    'Arts, Entertainment, and Recreation': 7,
    'Transportation and Warehousing': 8,
    'Accommodations': 9,
    'Financial Services': 10,
    'Certain Services': 11,
    'Wholesale Trade': 12,
    'Information': 13,
    'Administrative and Support Services': 14,
    'Multiple': 15,
    'Manufacturing': 16,
    'Insurance': 17,
    'Utilities': 18,
}
category = [NAICS_CATEGORY_CODES.get(value, "fail")
            for value in df_buss_4["NAICS Code Description"]]
# Add the numeric category as a new column.
# NOTE(review): df_buss_4 is derived by filtering df_buss; pandas may emit a
# SettingWithCopyWarning here -- consider building df_buss_4 with .copy() upstream.
df_buss_4["Result"] = category
# Inspect the first ten rows of the categorized dataset
# (the original comment said "five" but the code shows ten)
df_buss_4.head(10)
# Export the categorized dataset to CSV
df_buss_4.to_csv('buss_sanfra_categorized.csv')
# Define a reference point in San Francisco to center the street network on.
# NOTE(review): the original comment said "corner of California St and Mason St",
# but these (lat, lon) coordinates look closer to the Civic Center area -- confirm.
location_point = (37.7842, -122.4197)
# Create the drivable street network inside a bounding box extending
# 3000 m (dist=3000) N/S/E/W of the point.
# (The original comment said 750 m, which contradicts the code.)
G2 = ox.graph_from_point(location_point, dist=3000, dist_type='bbox', network_type='drive')
# Project the graph (osmnx picks an appropriate CRS) for accurate plotting/distances
G2 = ox.project_graph(G2)
fig, ax = ox.plot_graph(G2, node_size=10, node_color='white', figsize=(20,20))
# save street network as ESRI shapefile to work with in GIS
# specify filepath to create a folder
ox.save_graph_shapefile(G2, filepath='./shape_exported/shape_4')
G2
$D_{1}(i) = \sum_{j=1}^{n} w_{ij} \log_{10} \frac{n - 1}{g_{j}^{\alpha}}$
# Compute Distinctiveness Centrality on the street network
# (measure D1 only, unnormalized, with alpha = 1) and tabulate it by node id
DC = pd.DataFrame(
    distinctiveness(G2, normalize=False, alpha=1, measures=["D1"])
).sort_index()
DC.head()
# Degree centrality for every node, collected straight into a
# single-column DataFrame indexed by node id
df_dgc = pd.DataFrame.from_dict(
    nx.degree_centrality(G2), orient='index', columns=['degree_centrality']
).sort_index()
df_dgc.head()
# Closeness centrality for every node, collected straight into a
# single-column DataFrame indexed by node id
df_dcc = pd.DataFrame.from_dict(
    nx.closeness_centrality(G2), orient='index', columns=['closeness_centrality']
).sort_index()
df_dcc.head()
# Eigenvector centrality (numpy-based solver) for every node,
# collected into a single-column DataFrame indexed by node id
df_dec = pd.DataFrame.from_dict(
    nx.eigenvector_centrality_numpy(G2), orient='index', columns=['eigenvector_centrality']
).sort_index()
df_dec.head()
# Join all per-node centrality metrics into one table aligned on node id
metric_frames = [df_dgc, df_dcc, df_dec, DC['D1_in']]
final_df = pd.concat(metric_frames, axis=1, sort=False)
final_df.head()
# Persist the per-node metrics table as CSV
final_df.to_csv('nodes_metrics_business_sanfra_fixed_finaltry.csv')
# Load the dataset joining centrality metrics with node coordinates,
# as exported from QGIS
full_dataset = pd.read_csv('nodes_coordinates_centrality_full.csv', low_memory=False)
full_dataset.head()
# Restrict to the node id, coordinates, and the four centrality measures
keep_columns = ['osmid', 'lon', 'lat', 'degree_centrality',
                'closeness_centrality', 'eigenvector_centrality', 'D1_in']
full_dataset = full_dataset[keep_columns]
# Show the first five rows
full_dataset.head()
# Keep only nodes whose degree centrality is at least 0.00259
high_degree_mask = full_dataset['degree_centrality'] >= 0.00259
full_dataset_high_centrality = full_dataset.loc[high_degree_mask]
full_dataset_high_centrality.head()
# Export to CSV for visualization in QGIS
full_dataset_high_centrality.to_csv('nodes_highest_degree_centrality.csv')
# Keep only nodes whose closeness centrality is at least 0.0354
# (the original comment mistakenly said "degree centrality")
high_closeness_mask = full_dataset['closeness_centrality'] >= 0.0354
full_dataset_high_closeness = full_dataset.loc[high_closeness_mask]
full_dataset_high_closeness.head()
# Export to CSV for visualization in QGIS
full_dataset_high_closeness.to_csv('nodes_highest_closeness_centrality.csv')